Updated WebsiteAgent to receive events

Maximilian Clarke 11 years ago
parent
commit
9bf3c2c824
2 changed files with 34 additions and 7 deletions
  1. app/models/agents/website_agent.rb (+20 −6)
  2. spec/models/agents/website_agent_spec.rb (+14 −1)

+ 20 - 6
app/models/agents/website_agent.rb

@@ -4,7 +4,6 @@ require 'date'
4 4
 
5 5
 module Agents
6 6
   class WebsiteAgent < Agent
7
-    cannot_receive_events!
8 7
 
9 8
     default_schedule "every_12h"
10 9
 
@@ -46,6 +45,8 @@ module Agents
46 45
       Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance).  This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
47 46
 
48 47
       Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
48
+
49
+      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload.
49 50
     MD
50 51
 
51 52
     event_description do
@@ -105,19 +106,23 @@ module Agents
105 106
     end
106 107
 
107 108
     def check
108
-      hydra = Typhoeus::Hydra.new
109 109
       log "Fetching #{options['url']}"
110
+      check_url options['url']
111
+    end
112
+
113
+    def check_url(in_url)
114
+      hydra = Typhoeus::Hydra.new
110 115
       request_opts = { :followlocation => true }
111 116
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
112 117
 
113 118
       requests = []
114 119
 
115
-      if options['url'].kind_of?(Array)
116
-        options['url'].each do |url|
120
+      if in_url.kind_of?(Array)
121
+        in_url.each do |url|
117 122
            requests.push(Typhoeus::Request.new(url, request_opts))
118 123
         end
119 124
       else
120
-        requests.push(Typhoeus::Request.new(options['url'], request_opts))
125
+        requests.push(Typhoeus::Request.new(in_url, request_opts))
121 126
       end
122 127
 
123 128
       requests.each do |request|
@@ -185,7 +190,7 @@ module Agents
185 190
               options['extract'].keys.each do |name|
186 191
                 result[name] = output[name][index]
187 192
                 if name.to_s == 'url'
188
-                  result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
193
+                  result[name] = URI.join(request.base_url, result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
189 194
                 end
190 195
               end
191 196
 
@@ -202,6 +207,13 @@ module Agents
202 207
       end
203 208
     end
204 209
 
210
+    def receive(incoming_events)
211
+      incoming_events.each do |event|
212
+        url_to_scrape = Utils.value_at(event['payload'], 'url')
213
+        check_url(url_to_scrape)
214
+      end
215
+    end
216
+
205 217
     private
206 218
 
207 219
     # This method returns true if the result should be stored as a new event.
@@ -275,5 +287,7 @@ module Agents
275 287
         false
276 288
       end
277 289
     end
290
+
278 291
   end
292
+
279 293
 end

+ 14 - 1
spec/models/agents/website_agent_spec.rb

@@ -331,6 +331,19 @@ describe Agents::WebsiteAgent do
331 331
         end
332 332
       end
333 333
     end
334
+
335
+    describe "#receive" do
336
+      it "should scrape from the url element in incoming event payload" do
337
+        @event = Event.new
338
+        @event.agent = agents(:bob_rain_notifier_agent)
339
+        @event.payload = { 'url' => "http://xkcd.com" }
340
+
341
+        lambda {
342
+          @checker.options = @site
343
+          @checker.receive([@event])
344
+        }.should change { Event.count }.by(1)
345
+      end
346
+    end
334 347
   end
335 348
 
336 349
   describe "checking with http basic auth" do
@@ -361,4 +374,4 @@ describe Agents::WebsiteAgent do
361 374
       end
362 375
     end
363 376
   end
364
-end
377
+end